# Load the three Friends datasets from CSV files (paths are relative to the
# current working directory). read.csv() already defaults to sep = "," and
# header = TRUE, so those arguments are not spelled out.
friends <- read.csv("friends.csv")
friends_info <- read.csv("friends_info.csv")
friends_emotions <- read.csv("friends_emotions.csv")
library(tidyverse)
# The six main characters, spelled the way they are matched against the
# `speaker` column below.
main_characters <- c(
  "Monica Geller", "Joey Tribbiani", "Chandler Bing",
  "Phoebe Buffay", "Rachel Green", "Ross Geller"
)

# One row per main character with the number of rows (lines) attributed to
# them in `friends`, ordered from most to fewest lines.
line_distribution <- friends |>
  filter(speaker %in% main_characters) |>
  group_by(speaker) |>
  summarise(total_lines = n()) |>
  arrange(desc(total_lines))

# Print a heading followed by the table.
cat("Distribution of lines spoken by main characters:\n")
## Distribution of lines spoken by main characters:
print(line_distribution)
## # A tibble: 6 × 2
## speaker total_lines
## <chr> <int>
## 1 Rachel Green 9312
## 2 Ross Geller 9157
## 3 Chandler Bing 8465
## 4 Monica Geller 8441
## 5 Joey Tribbiani 8171
## 6 Phoebe Buffay 7501
# Bar chart of the per-character line counts. reorder() sorts the bars by
# total_lines so they rise from left to right.
ggplot(
  line_distribution,
  aes(x = reorder(speaker, total_lines), y = total_lines)
) +
  # geom_col() draws bars whose heights are the y values as given.
  geom_col(fill = "skyblue") +
  theme_minimal() +
  labs(
    title = "Distribution of Lines Spoken by Main Characters in Friends",
    x = "Characters",
    y = "Total Lines"
  ) +
  # Tilt the character names so neighbouring labels do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# The six main characters (re-declared here so this section can be run on
# its own).
main_characters <- c(
  "Monica Geller", "Joey Tribbiani", "Chandler Bing",
  "Phoebe Buffay", "Rachel Green", "Ross Geller"
)

# Per-character sum of the `utterance` column (missing values ignored),
# ordered from largest to smallest.
# NOTE(review): this adds up the values stored in `utterance`. If that column
# is a running index within a scene rather than a count, the sum is not a
# number of utterances -- confirm against the dataset documentation.
utterance_distribution <- friends |>
  filter(speaker %in% main_characters) |>
  group_by(speaker) |>
  summarise(total_utterance = sum(utterance, na.rm = TRUE)) |>
  arrange(desc(total_utterance))

# Print a heading followed by the table.
cat("Distribution of utterances spoken by main characters:\n")
## Distribution of utterances spoken by main characters:
print(utterance_distribution)
## # A tibble: 6 × 2
## speaker total_utterance
## <chr> <int>
## 1 Rachel Green 187427
## 2 Ross Geller 182134
## 3 Monica Geller 157199
## 4 Chandler Bing 154488
## 5 Joey Tribbiani 151005
## 6 Phoebe Buffay 132168
# Bar chart of the per-character utterance totals, bars ordered by
# total_utterance via reorder().
ggplot(
  utterance_distribution,
  aes(x = reorder(speaker, total_utterance), y = total_utterance)
) +
  # geom_col() draws bars whose heights are the y values as given.
  geom_col(fill = "lightgreen") +
  theme_minimal() +
  labs(
    title = "Distribution of Utterances Spoken by Main Characters in Friends",
    x = "Characters",
    y = "Total Utterances"
  ) +
  # Tilt the character names so neighbouring labels do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Supporting characters: drop rows with a missing speaker, the main cast, and
# the two labels that are not characters ("#ALL#" and "Scene Directions").
# Then total lines and summed `utterance` per speaker, keep only speakers with
# more than 150 lines, and order from most to fewest lines.
non_main_characters <- friends |>
  filter(
    !is.na(speaker),
    !(speaker %in% c(main_characters, "#ALL#", "Scene Directions"))
  ) |>
  group_by(speaker) |>
  summarise(
    total_lines = n(),
    total_utterance = sum(utterance, na.rm = TRUE)
  ) |>
  filter(total_lines > 150) |>
  arrange(desc(total_lines))
# Bar chart of line counts for the supporting characters selected above,
# bars ordered by total_lines via reorder().
ggplot(
  non_main_characters,
  aes(x = reorder(speaker, total_lines), y = total_lines)
) +
  # geom_col() draws bars whose heights are the y values as given.
  geom_col(fill = "lightcoral") +
  theme_minimal() +
  labs(
    title = "Non-Main Characters with most lines in Friends",
    x = "Characters",
    y = "Total Lines"
  ) +
  # Tilt the character names so neighbouring labels do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Lines per season for three selected supporting characters.
# complete() then adds a row for every season 1-10 x speaker combination that
# has no lines, with Number_of_Lines set to 0, so every season appears in the
# plot.
speaker_distribution <- friends |>
  filter(
    speaker %in% c("Janice Litman Goralnik", "Mike Hannigan", "Richard Burke")
  ) |>
  group_by(speaker, season) |>
  summarise(Number_of_Lines = n(), .groups = "drop") |>
  complete(season = 1:10, speaker, fill = list(Number_of_Lines = 0))
# Grouped bar chart: one cluster per season, one bar per speaker.
ggplot(
  speaker_distribution,
  aes(x = factor(season), y = Number_of_Lines, fill = speaker)
) +
  # Side-by-side bars with a black outline; heights are the y values as given.
  geom_col(position = "dodge", color = "black") +
  theme_minimal() +
  labs(
    title = "Line Distribution for Selected Characters Across Seasons",
    x = "Season",
    y = "Number of Lines",
    fill = "Speaker"
  ) +
  # One fill colour per speaker.
  scale_fill_manual(values = c("skyblue", "orange", "lightgreen")) +
  # Centre and embolden the title; set axis and legend text sizes.
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    axis.title = element_text(size = 12),
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10)
  )
# Lines per speaker (main or not) in every episode, with "Scene Directions"
# rows removed. This reuses the name `line_distribution`, replacing the
# per-character table built earlier.
line_distribution <- friends |>
  filter(speaker != "Scene Directions") |>
  group_by(season, episode, speaker) |>
  summarise(total_lines = n(), .groups = "drop")

# For each episode keep the speaker(s) whose line count equals the episode
# maximum. Ties are all kept, and the result stays grouped by season and
# episode.
most_lines_per_episode <- line_distribution |>
  group_by(season, episode) |>
  filter(total_lines == max(total_lines))

# Show the result.
print(most_lines_per_episode)
## # A tibble: 244 × 4
## # Groups: season, episode [236]
## season episode speaker total_lines
## <int> <int> <chr> <int>
## 1 1 1 Monica Geller 73
## 2 1 2 Ross Geller 68
## 3 1 3 Monica Geller 52
## 4 1 4 Monica Geller 47
## 5 1 5 Ross Geller 40
## 6 1 6 Chandler Bing 58
## 7 1 7 Ross Geller 53
## 8 1 8 Ross Geller 61
## 9 1 9 Monica Geller 48
## 10 1 10 Phoebe Buffay 51
## # ℹ 234 more rows
# Episodes whose top speaker is not one of the six main characters.
non_main_results <- most_lines_per_episode |>
  filter(!(speaker %in% main_characters))

# Report the matching episodes, or say that there are none.
if (nrow(non_main_results) == 0) {
  print("No episodes where a non-main character has the most lines.")
} else {
  print("Episodes where a non-main character has the most lines:")
  print(non_main_results)
}
## [1] "Episodes where a non-main character has the most lines:"
## # A tibble: 2 × 4
## # Groups: season, episode [2]
## season episode speaker total_lines
## <int> <int> <chr> <int>
## 1 6 21 Paul Stevens 44
## 2 9 8 Amy Green 58
# All rows spoken by Joey.
joey_lines <- subset(friends, speaker == "Joey Tribbiani")

# Flag every Joey line that contains the catchphrase (case-insensitive).
# The pattern deliberately has no trailing "?": in a regular expression an
# unescaped "?" is a quantifier that would make the final "n" optional rather
# than match a question mark. Matching on the phrase stem also picks up
# spellings such as "How you doin'?".
# The flag vector is computed once and shared by the total and the per-season
# counts so the two can never disagree.
says_catchphrase <- grepl("How you doin", joey_lines$text, ignore.case = TRUE)

# Number of Joey lines containing the phrase (a line counts once even if the
# phrase appears in it more than once).
how_you_doin_count <- sum(says_catchphrase)

# Print the result
cat("Joey says 'How you doin?'", how_you_doin_count, "times.\n")
## Joey says 'How you doin?' 25 times.

# Same count broken down by season: one row per season in which Joey has at
# least one line, including seasons where the count is 0.
how_you_doin_by_season <- aggregate(
  says_catchphrase,
  by = list(season = joey_lines$season),
  FUN = sum
)

# Rename columns for clarity
names(how_you_doin_by_season) <- c("season", "count")
# Load ggplot2 library for plotting
library(ggplot2)
# Line plot of the per-season catchphrase counts, with a point at each season.
ggplot(how_you_doin_by_season, aes(x = season, y = count)) +
  # Line thickness is set with `linewidth`; using `size` for lines has been
  # deprecated since ggplot2 3.4.0 and triggers a warning.
  geom_line(color = "blue", linewidth = 1) +
  geom_point(color = "red", size = 3) + # Points at each data value
  labs(
    title = "Number of Times Joey Says 'How you doin?' by Season",
    x = "Season",
    y = "Count"
  ) +
  scale_x_continuous(breaks = 0:10, limits = c(0, 10)) + # X-axis from 0 to 10
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Number of lines per main character whose text contains "I love you"
# (case-sensitive), ordered from most to fewest.
love_you_count <- friends |>
  filter(
    speaker %in% main_characters,
    str_detect(text, "I love you")
  ) |>
  group_by(speaker) |>
  summarise(total_count = n()) |>
  arrange(desc(total_count))

# Print a heading followed by the table.
cat("Occurrences of 'I love you' by each main character:\n")
## Occurrences of 'I love you' by each main character:
print(love_you_count)
## # A tibble: 6 × 2
## speaker total_count
## <chr> <int>
## 1 Monica Geller 45
## 2 Chandler Bing 33
## 3 Ross Geller 31
## 4 Rachel Green 25
## 5 Phoebe Buffay 19
## 6 Joey Tribbiani 11
# Row(s) of love_you_count holding the highest count. More than one row is
# returned when several characters tie for first place.
most_love <- love_you_count |>
  filter(total_count == max(total_count))

# Report the winner. Tied names are joined with ", " and the shared count is
# printed once, so the sentence stays readable when there is a tie.
cat(
  "The character who said 'I love you' the most is:",
  paste(most_love$speaker, collapse = ", "),
  "with", unique(most_love$total_count), "occurrences.\n"
)
## The character who said 'I love you' the most is: Monica Geller with 45 occurrences.
# Number of lines containing "I love you" per season and main character,
# ordered by season. Season/character pairs with no matching line have no row.
love_you_by_season <- friends |>
  filter(
    speaker %in% main_characters,
    str_detect(text, "I love you")
  ) |>
  group_by(season, speaker) |>
  summarise(total_count = n(), .groups = "drop") |>
  arrange(season)
# One coloured line per main character showing their "I love you" count in
# each season, with a point at every observed season.
ggplot(love_you_by_season, aes(x = season, y = total_count, color = speaker)) +
  # Line thickness is set with `linewidth`; using `size` for lines has been
  # deprecated since ggplot2 3.4.0 and triggers a warning.
  geom_line(linewidth = 1) +
  geom_point(size = 3) + # Add points to the lines
  theme_minimal() +
  labs(
    title = "Occurrences of 'I love you' by six main charactors Through the Seasons",
    x = "Season",
    y = "Total Occurrences"
  ) +
  # Seasons 1 to 10 on the x-axis, one tick per season.
  scale_x_continuous(breaks = 1:10, limits = c(1, 10)) +
  # Integer ticks on the y-axis from 0 up to the largest observed count.
  scale_y_continuous(
    breaks = seq(0, max(love_you_by_season$total_count), by = 1)
  ) +
  # Rotate x-axis labels for readability.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Candidate 1: IMDb rating explained by US viewership, with a scatter plot of
# the two variables in the same orientation.
fit <- lm(formula = imdb_rating ~ us_views_millions, data = friends_info)
plot(x = friends_info$us_views_millions, y = friends_info$imdb_rating)

# Candidate 2: the reverse direction, viewership explained by IMDb rating.
fit.1 <- lm(formula = us_views_millions ~ imdb_rating, data = friends_info)
plot(x = friends_info$imdb_rating, y = friends_info$us_views_millions)

# Author's rationale for preferring candidate 1 (us_views_millions as x,
# imdb_rating as y): the x values were collected about 10 years before the
# y values.
# Draw the diagnostic plots for that model and print its coefficient summary.
plot(fit)
summary(fit)
##
## Call:
## lm(formula = imdb_rating ~ us_views_millions, data = friends_info)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.15016 -0.24770 -0.02827 0.22250 1.17155
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.731888 0.119473 64.717 < 2e-16 ***
## us_views_millions 0.028757 0.004613 6.234 2.1e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3699 on 234 degrees of freedom
## Multiple R-squared: 0.1424, Adjusted R-squared: 0.1388
## F-statistic: 38.87 on 1 and 234 DF, p-value: 2.097e-09
# Sensitivity check: drop the four rows treated as outliers (row positions
# 36, 37, 235 and 236) and refit to see whether the model changes much.
# Note that this overwrites `friends_info` with the reduced data.
friends_info <- friends_info |>
  slice(-c(36, 37, 235, 236))

# Refit the same model on the reduced data, then show diagnostics and summary.
fit <- lm(formula = imdb_rating ~ us_views_millions, data = friends_info)
plot(fit)
summary(fit)
##
## Call:
## lm(formula = imdb_rating ~ us_views_millions, data = friends_info)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.14531 -0.24731 -0.02954 0.21320 1.16609
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.691313 0.158578 48.502 < 2e-16 ***
## us_views_millions 0.030419 0.006295 4.832 2.47e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3669 on 230 degrees of freedom
## Multiple R-squared: 0.09217, Adjusted R-squared: 0.08822
## F-statistic: 23.35 on 1 and 230 DF, p-value: 2.467e-06
# Re-read the datasets from disk so `friends_info` contains every row again
# after the outlier-removal step overwrote it.
# read.csv() already defaults to sep = "," and header = TRUE.
friends <- read.csv("friends.csv")
friends_info <- read.csv("friends_info.csv")
friends_emotions <- read.csv("friends_emotions.csv")

# Refit the rating ~ viewership model on the full data and show its
# diagnostic plots and summary.
fit <- lm(formula = imdb_rating ~ us_views_millions, data = friends_info)
plot(fit)
summary(fit)
##
## Call:
## lm(formula = imdb_rating ~ us_views_millions, data = friends_info)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.15016 -0.24770 -0.02827 0.22250 1.17155
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.731888 0.119473 64.717 < 2e-16 ***
## us_views_millions 0.028757 0.004613 6.234 2.1e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3699 on 234 degrees of freedom
## Multiple R-squared: 0.1424, Adjusted R-squared: 0.1388
## F-statistic: 38.87 on 1 and 234 DF, p-value: 2.097e-09
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
# Box-Cox plot for the current model, used to judge which power
# transformation of the response might be appropriate.
boxcox(fit)

# Refit with three transformations of the response: reciprocal, log and
# square root.
fit.reciprocal <- lm(
  formula = 1 / imdb_rating ~ us_views_millions,
  data = friends_info
)
fit.log <- lm(
  formula = log(imdb_rating) ~ us_views_millions,
  data = friends_info
)
fit.sqrt <- lm(
  formula = sqrt(imdb_rating) ~ us_views_millions,
  data = friends_info
)

# Diagnostic plots for each transformed model, to check whether the residuals
# improve.
plot(fit.reciprocal)
plot(fit.log)
plot(fit.sqrt)